library(tidyverse)
library(caret)
library(lattice)
library(DataExplorer)

# Load the fake job postings dataset (CSV hosted on GitHub) into `job_post`.
job_post <- read_csv(
  file = "https://raw.githubusercontent.com/luizmalpele/datasets/master/fake_job_postings.csv"
)
Warning: closing unused connection 3 (https://raw.githubusercontent.com/luizmalpele/StatsLearningProject/master/data/fake_job_postings.csv)
Parsed with column specification:
cols(
  job_id = col_double(),
  title = col_character(),
  location = col_character(),
  department = col_character(),
  salary_range = col_character(),
  company_profile = col_character(),
  description = col_character(),
  requirements = col_character(),
  benefits = col_character(),
  telecommuting = col_double(),
  has_company_logo = col_double(),
  has_questions = col_double(),
  employment_type = col_character(),
  required_experience = col_character(),
  required_education = col_character(),
  industry = col_character(),
  `function` = col_character(),
  fraudulent = col_double()
)
# Display the loaded job postings data (top-level autoprint).
job_post

Plot Missing Values

# Visualize the missing values in `job_post` via plot_missing()
# (presumably from the DataExplorer package loaded at the top of the file).
plot_missing(job_post)

Missing Information Comparison

# For each value of `fraudulent`, compute the proportion of missing (NA)
# entries in every selected descriptive column. `mean(is.na(x))` is the share
# of NA values of `x` within the group.
job_post %>%
  select(
    fraudulent, department, required_education, benefits,
    required_experience, salary_range, location, requirements,
    company_profile, employment_type, industry
  ) %>%
  group_by(fraudulent) %>%
  summarize(
    na_ratio_salary = mean(is.na(salary_range)),
    na_ratio_department = mean(is.na(department)),
    na_ratio_required_education = mean(is.na(required_education)),
    na_ratio_benefits = mean(is.na(benefits)),
    na_ratio_requirements = mean(is.na(requirements)),
    na_ratio_company_profile = mean(is.na(company_profile)),
    na_ratio_location = mean(is.na(location)),
    na_ratio_employment_type = mean(is.na(employment_type)),
    na_ratio_industry = mean(is.na(industry)),
    # `required_experience` is selected above but was never summarized before;
    # it is appended last so the existing columns keep their names and order.
    na_ratio_required_experience = mean(is.na(required_experience))
  )

The variables that presented a higher missing-information ratio are: company_profile and employment_type.

# For each value of `fraudulent`, divide the sum of each indicator column by
# the number of rows in the group (presumably 0/1 flags, so this is the share
# of postings with the flag set -- verify against the data). NA values are
# not removed, so any NA in a column makes that group's ratio NA.
job_post %>%
  group_by(fraudulent) %>%
  summarize(
    ratio_has_questions = sum(has_questions) / n(),
    ratio_has_company_logo = sum(has_company_logo) / n(),
    ratio_telecommuting = sum(telecommuting) / n()
  )

Among these indicator variables, the ratios that stand out between the two groups are: ratio_has_questions and ratio_has_company_logo (these are proportions of postings with the flag set, not missing-information ratios). The next step is to investigate the titles and descriptions using data mining and text mining techniques.

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShjYXJldCkNCmxpYnJhcnkobGF0dGljZSkNCmxpYnJhcnkoRGF0YUV4cGxvcmVyKQ0KDQojIEdldCBiYXJzdG9vbCBkYXRhIG9mZiBnaXRodWINCmpvYl9wb3N0IDwtIHJlYWRfY3N2KCJodHRwczovL3Jhdy5naXRodWJ1c2VyY29udGVudC5jb20vbHVpem1hbHBlbGUvZGF0YXNldHMvbWFzdGVyL2Zha2Vfam9iX3Bvc3RpbmdzLmNzdiIpDQpgYGANCg0KYGBge3J9DQpqb2JfcG9zdA0KYGBgDQoNCiMjIyBQbG90IE1pc3NpbmcgVmFsdWVzDQoNCmBgYHtyfQ0KcGxvdF9taXNzaW5nKGpvYl9wb3N0KQ0KYGBgDQoNCiMjIyBNaXNzaW5nIEluZm9ybWF0aW9uIENvbXBhcmlzb24NCg0KYGBge3J9DQpqb2JfcG9zdCAlPiUNCiAgc2VsZWN0KGZyYXVkdWxlbnQsIGRlcGFydG1lbnQsIHJlcXVpcmVkX2VkdWNhdGlvbiwgYmVuZWZpdHMsIHJlcXVpcmVkX2V4cGVyaWVuY2Usc2FsYXJ5X3JhbmdlLCBsb2NhdGlvbiwgcmVxdWlyZW1lbnRzLCBjb21wYW55X3Byb2ZpbGUsIGVtcGxveW1lbnRfdHlwZSwgaW5kdXN0cnkpICU+JSANCiAgZ3JvdXBfYnkoZnJhdWR1bGVudCkgJT4lIA0KICBzdW1tYXJpemUobmFfcmF0aW9fc2FsYXJ5ID0gc3VtKGlzLm5hKHNhbGFyeV9yYW5nZSkpL2xlbmd0aChzYWxhcnlfcmFuZ2UpLA0KICAgICAgICAgICAgbmFfcmF0aW9fZGVwYXJ0bWVudCA9IHN1bShpcy5uYShkZXBhcnRtZW50KSkvbGVuZ3RoKGRlcGFydG1lbnQpLA0KICAgICAgICAgICAgbmFfcmF0aW9fcmVxdWlyZWRfZWR1Y2F0aW9uID0gc3VtKGlzLm5hKHJlcXVpcmVkX2VkdWNhdGlvbikpL2xlbmd0aChyZXF1aXJlZF9lZHVjYXRpb24pLA0KICAgICAgICAgICAgbmFfcmF0aW9fYmVuZWZpdHMgPSBzdW0oaXMubmEoYmVuZWZpdHMpKS9sZW5ndGgoYmVuZWZpdHMpLA0KICAgICAgICAgICAgbmFfcmF0aW9fcmVxdWlyZW1lbnRzID0gc3VtKGlzLm5hKHJlcXVpcmVtZW50cykpL2xlbmd0aChyZXF1aXJlbWVudHMpLA0KICAgICAgICAgICAgbmFfcmF0aW9fY29tcGFueV9wcm9maWxlID0gc3VtKGlzLm5hKGNvbXBhbnlfcHJvZmlsZSkpL2xlbmd0aChjb21wYW55X3Byb2ZpbGUpLA0KICAgICAgICAgICAgbmFfcmF0aW9fbG9jYXRpb24gPSBzdW0oaXMubmEobG9jYXRpb24pKS9sZW5ndGgobG9jYXRpb24pLA0KICAgICAgICAgICAgbmFfcmF0aW9fZW1wbG95bWVudF90eXBlID0gc3VtKGlzLm5hKGVtcGxveW1lbnRfdHlwZSkpL2xlbmd0aChlbXBsb3ltZW50X3R5cGUpLA0KICAgICAgICAgICAgbmFfcmF0aW9faW5kdXN0cnkgPSBzdW0oaXMubmEoaW5kdXN0cnkpKS9sZW5ndGgoaW5kdXN0cnkpDQogICAgICAgICAgICApDQpgYGANCg0KVGhlIHZhcmlhYmxlcyB0aGF0IHByZXNlbnRlZCBoaWdoZXIgbWlzc2luZyBpbmZvcm1hdGlvbiByYXRpbyBhcmU6IF9jb21wYW55X3Byb2ZpbGVfIGFuZCBfZW1w
bG95bWVudF90eXBlXy4NCg0KYGBge3J9DQpqb2JfcG9zdCAlPiUgZ3JvdXBfYnkoZnJhdWR1bGVudCkgJT4lIA0KICBzdW1tYXJpemUocmF0aW9faGFzX3F1ZXN0aW9ucyA9IHN1bShoYXNfcXVlc3Rpb25zKS9sZW5ndGgoaGFzX3F1ZXN0aW9ucyksDQogICAgICAgICAgICByYXRpb19oYXNfY29tcGFueV9sb2dvID0gc3VtKGhhc19jb21wYW55X2xvZ28pL2xlbmd0aChoYXNfY29tcGFueV9sb2dvKSwNCiAgICAgICAgICAgIHJhdGlvX3RlbGVjb21tdXRpbmcgPSBzdW0odGVsZWNvbW11dGluZykvbGVuZ3RoKHRlbGVjb21tdXRpbmcpKSANCmBgYA0KDQpUaGUgdmFyaWFibGVzIHRoYXQgcHJlc2VudGVkIGhpZ2hlciBtaXNzaW5nIGluZm9ybWF0aW9uIHJhdGlvIGFyZTogX3JhdGlvX2hhc19xdWVzdGlvbnNfIGFuZCBfcmF0aW9faGFzX2NvbXBhbnlfbG9nb18uIFRoZSBuZXh0IHN0ZXAgaXMgdG8gaW52ZXN0aWdhdGUgdGhlIHRpdGxlcyBhbmQgZGVjcmlwdGlvbnMgdXNpbmcgX19kYXRhIG1pbmluZyBhbmQgdGV4dCBtaW5pbmdfXyB0ZWNuaXF1ZXMu